*-------------------------------------------------------------------------------
*						Appendix I Fig 2
*-------------------------------------------------------------------------------

** Project directories, held in global macros so every path is defined once
global Raw_data   "G:\project-finished\Descriptive\Data"
global App_data   "G:\project-finished\Descriptive\Appendix Data"
global Class_data "G:\project-finished\Descriptive\Classification"
global Work_lab   "G:\project-finished\Descriptive\Lab"
global Out_lab    "G:\project-finished\Descriptive\Out"

* Files referenced without a directory below resolve against the working folder
cd "$Work_lab"

* Close any log still open from an earlier run (capture: no error if none is open),
* then start a fresh log in the output folder
capture log close
log using "$Out_lab\Appendix I Fig 2", replace

* Do not pause output with --more-- prompts
set more off


**------------------------------------------------------------------------------
* Step 1.1: Long-table census tabulation, 2000 (industry x occupation)
*           Result: tabledata1.dta, one row per (industry_16, occ_7) cell
*           holding the tabulated head count in num_longtable.
**------------------------------------------------------------------------------
* Create an empty tabledata1 first so the append further down has a file to read
clear
set obs 0
save tabledata1, replace emptyok

* Industry x occupation cross-tabulation sheet; only the first 18 rows are used
import excel "$App_data\职业和行业就业分布.xlsx", sheet("2000行业-职业交叉") firstrow clear
keep in 1/18

* Remove rows that carry no industry label
drop if missing(行业分类)

* The seven occupation columns become num1..num7 so they can be reshaped
rename (一国家机关党群组织企业事业单位负责人 ///
        二专业技术人员 ///
        三办事人员和有关人员 ///
        四商业服务业人员 ///
        五农林牧渔水利业生产人员 ///
        六生产运输设备操作人员及有关人员 ///
        七不便分类的其他劳动者) ///
       (num1 num2 num3 num4 num5 num6 num7)
keep 行业分类 num*

* Strip blanks from the label, drop the grand-total row, and number the
* remaining industries in sheet order
replace 行业分类 = subinstr(行业分类, " ", "", .)
drop if 行业分类 == "总计"
generate industry_16 = _n

* Wide -> long: one row per industry/occupation pair, occupation index in occ_7
reshape long num, i(industry_16) j(occ_7)
destring num, generate(num_longtable)

keep industry_16 occ_7 num_longtable

append using tabledata1
save tabledata1, replace


**------------------------------------------------------------------------------
* Step 1.2: Census microdata, 2000 (industry x occupation)
*           Counts microdata records per (industry_16, occ_7) cell, merges them
*           with the long-table counts in tabledata1, and stores each source's
*           percentage share per cell plus their ratio (paper / longtable).
*           Result overwrites tabledata1.dta.
**------------------------------------------------------------------------------
use "$Raw_data\census2000.dta", clear

* Records without a usable industry or occupation code cannot be classified
drop if industry == . | occ == . | industry == 0 | occ == 0

* Industry: left-pad two-digit codes with "0", then group on the first two
* characters of the code. Prefixes 71, 88 and 98 (and 0) fall in no range and
* leave industry_16 missing.
tostring industry, replace
replace industry = "0" + industry if strlen(industry) == 2
gen temp = substr(industry, 1, 2)
destring temp, replace
gen industry_16 = .
replace industry_16 = 1  if inrange(temp, 1, 5)
replace industry_16 = 2  if inrange(temp, 6, 12)
replace industry_16 = 3  if inrange(temp, 13, 43)
replace industry_16 = 4  if inrange(temp, 44, 46)
replace industry_16 = 5  if inrange(temp, 47, 49)
replace industry_16 = 6  if inrange(temp, 50, 51)
replace industry_16 = 7  if inrange(temp, 52, 60)
replace industry_16 = 8  if inrange(temp, 61, 67)
replace industry_16 = 9  if inrange(temp, 68, 70)
replace industry_16 = 10 if inrange(temp, 72, 74)
replace industry_16 = 11 if inrange(temp, 75, 84)
replace industry_16 = 12 if inrange(temp, 85, 87)
replace industry_16 = 13 if inrange(temp, 89, 91)
replace industry_16 = 14 if inrange(temp, 92, 93)
replace industry_16 = 15 if inrange(temp, 94, 97)
replace industry_16 = 16 if temp == 99
drop temp

* Occupation: same padding, then group on the first character of the code.
* Code "999" is assigned last so it overrides the 6-9 group.
tostring occ, replace
replace occ = "0" + occ if strlen(occ) == 2
gen temp = substr(occ, 1, 1)
destring temp, replace

gen occ_7 = .
replace occ_7 = 1 if temp == 0
replace occ_7 = 2 if temp == 1 | temp == 2
replace occ_7 = 3 if inrange(temp, 3, 5)
replace occ_7 = temp if inrange(temp, 3, 5)
replace occ_7 = 6 if inrange(temp, 6, 9)
replace occ_7 = 7 if occ == "999"
drop temp

* One row per cell with the number of microdata records in it
gen byte num = 1
collapse (count) num, by(industry_16 occ_7)

* Keep only cells present in both sources before computing shares. Otherwise
* unmatched rows (for example the cell of records whose industry code maps to
* no group) would enter the denominators below and distort every share. This
* matches how the 2010 comparison restricts to matched rows.
merge 1:1 industry_16 occ_7 using tabledata1, keep(match) nogenerate

* Turn both count columns into percentage shares of their own column total
rename (num num_longtable) (paper longtable)
foreach var in paper longtable {
    egen pop`var' = total(`var')
    gen share`var' = `var' / pop`var' * 100
    drop pop`var' `var'
    rename share`var' `var'
}

* Ratio of the microdata share to the long-table share
gen ratio = paper / longtable

save tabledata1, replace


**------------------------------------------------------------------------------
* Step 1.3: Long-table census tabulation, 2010 (occupation sheet)
*           Result: tabledata2.dta with occ_title and its head count
*           in num_longtable.
**------------------------------------------------------------------------------
* Create an empty tabledata2 first so the append further down has a file to read
clear
set obs 0
save tabledata2, replace emptyok

import excel "$App_data\职业和行业就业分布.xlsx", sheet("2010职业小类") firstrow clear

* Only the label column and the head-count column are needed
keep 分类 人数
rename (分类 人数) (occ_title num_longtable)
destring num_longtable, replace

* Strip blanks from the titles, collapse exact duplicate rows, and drop the
* grand-total row
replace occ_title = subinstr(occ_title, " ", "", .)
duplicates drop
drop if occ_title == "总计"

append using tabledata2
save tabledata2, replace

**------------------------------------------------------------------------------
* Step 1.4: Census microdata, 2010 (occupation)
*           Counts microdata records per occupation title, merges them with the
*           long-table counts in tabledata2, and stores each source's
*           percentage share per title plus their ratio (paper / longtable).
*           Result overwrites tabledata2.dta.
**------------------------------------------------------------------------------
use "$Raw_data\census2010.dta", clear

* Records without an occupation code cannot be classified
drop if _职业 == .

* Occupation code as a string, with two-digit codes left-padded by "0"
tostring _职业, gen(occ_2010)
replace occ_2010 = "0" + occ_2010 if strlen(occ_2010) == 2

* Attach the occupation title from the lookup file (resolved against the
* working directory). keep(master match) discards lookup rows that have no
* census record: they are not respondents, and leaving them in would make the
* count below credit each such occupation with one spurious person.
merge m:1 occ_2010 using "2010_occ_consistent_characteristics.dta", keep(master match) nogenerate

* Two codes get their titles assigned by hand
replace title_2010 = "不便分类的其他从业人员" if occ_2010 == "999"
replace title_2010 = "其他检验、计量人员" if occ_2010 == "939"
rename title_2010 occ_title

* One row per title with the number of microdata records carrying it
gen byte num = 1
collapse (count) num, by(occ_title)

* Keep only titles that appear in both the microdata and the long table
merge 1:1 occ_title using tabledata2, keep(match) nogenerate

* Turn both count columns into percentage shares of their own column total
rename (num num_longtable) (paper longtable)
foreach var in paper longtable {
    egen pop`var' = total(`var')
    gen share`var' = `var' / pop`var' * 100
    drop pop`var' `var'
    rename share`var' `var'
}

* Ratio of the microdata share to the long-table share
gen ratio = paper / longtable

save tabledata2, replace


**------------------------------------------------------------------------------
* Step 2: Plot Data
*         Kernel density of the share ratio for each comparison, with a red
*         reference line at 1. Panel a uses tabledata1, panel b tabledata2.
**------------------------------------------------------------------------------
* Per-panel x-axis title and file-name suffix
local xtitle1 "本文职业-行业的就业占比/统计局就业占比"
local xtitle2 "本文职业（小类）就业占比/统计局就业占比"
local panel1 "a"
local panel2 "b"

forvalues k = 1/2 {
    use tabledata`k', clear

    twoway (kdensity ratio), ///
        scheme(plotplain) ///
        xtitle("`xtitle`k''") ///
        ytitle("核密度") ///
        xline(1,lc(red)) xlabel(,nogrid) ylabel(,nogrid)

    * Save both the editable Stata graph and a PNG export
    graph save "$Out_lab\appI_Fig2`panel`k''", replace
    graph export "$Out_lab\appI_Fig2`panel`k''.png", replace
}

* Remove the intermediate datasets from the working directory
erase tabledata1.dta
erase tabledata2.dta

log close


